-- NSE script metadata: free-form description shown by --script-help.
description = [[
Spiders a website and attempts to match all pages and urls against a given
string. Matches are counted and grouped per url under which they were
discovered.
]]

---
-- @usage
-- nmap -p 80 www.example.com --script http-grep --script-args='http-grep.match="[A-Za-z0-9%.%%%+%-]+@[A-Za-z0-9%.%%%+%-]+%.%w%w%w?%w?",http-grep.breakonmatch'
--
-- @output
-- PORT   STATE SERVICE REASON
-- 80/tcp open  http    syn-ack
-- | http-grep: 
-- |   (4) http://example.com/name/
-- |     + name@example.com
-- |     + name@example.com
-- |     + name@example.com
-- |     + name@example.com
-- |   (4) http://example.com/sales.html
-- |     + sales@example.com
-- |     + sales@example.com
-- |     + sales@example.com
-- |__   + sales@example.com


-- @args http-grep.match the string to match in urls and page contents
-- @args http-grep.breakonmatch if set, the script stops crawling once
--       the first matching page has been found
-- @args http-grep.maxdepth the maximum amount of directories beneath
--       the initial url to spider. A negative value disables the limit.
--       (default: 3)
-- @args http-grep.maxpagecount the maximum amount of pages to visit.
--       A negative value disables the limit (default: 20)
-- @args http-grep.url the url to start spidering. This is a URL
--       relative to the scanned host eg. /default.html (default: /)
-- @args http-grep.withinhost only spider URLs within the same host.
--       (default: true)
-- @args http-grep.withindomain only spider URLs within the same
--       domain. This widens the scope from <code>withinhost</code> and can
--       not be used in combination. (default: false)

-- Standard NSE script fields consumed by the script database.
author = "Patrik Karlsson"
license = "Same as Nmap--See http://nmap.org/book/man-legal.html"
categories = {"discovery", "safe"}

require 'httpspider'
require 'shortport'
require 'stdnse'
require 'url'

-- Run against any port/service that shortport classifies as HTTP.
portrule = shortport.http

-- Truncate a matched string for display purposes: results longer than
-- 60 characters are cut at the 60th character and suffixed with " ...".
local function shortenMatch(match)
	if ( #match <= 60 ) then
		return match
	end
	return match:sub(1, 60) .. " ..."
end

-- Main entry point: crawls the target web server and reports, per URL,
-- every occurrence of the user supplied pattern found in the page body.
-- @param host the host table as received by the rule functions
-- @param port the port table as received by the rule functions
-- @return formatted script output (matches grouped per url), or an
--         error string when http-grep.match is unset or crawling fails
action = function(host, port)

	-- read script specific arguments
	local match          = stdnse.get_script_args("http-grep.match")
	local break_on_match = stdnse.get_script_args("http-grep.breakonmatch")

	if ( not(match) ) then
		return stdnse.format_output(true, "ERROR: Argument http-grep.match was not set")
	end

	local crawler = httpspider.Crawler:new(host, port, '/', { scriptname = SCRIPT_NAME } )
	local results = {}

	-- set timeout to 10 seconds
	crawler:set_timeout(10000)

	while(true) do
		local status, r = crawler:crawl()
		-- if the crawler fails it can be due to a number of different reasons
		-- most of them are "legitimate" (e.g. no more pages) and should not
		-- be reason to abort; only stop with an error when one is flagged
		if ( not(status) ) then
			if ( r.err ) then
				return stdnse.format_output(true, "ERROR: %s", r.reason)
			else
				break
			end
		end

		-- a fetched page may come back without a body (failed request,
		-- bodyless response); treat that as empty instead of crashing
		local body = ( r.response and r.response.body ) or ""
		local matches = {}

		-- try to match the url and body
		if ( body:match( match ) or tostring(r.url):match(match) ) then
			-- count and collect matches in a single pass; the previous
			-- body:gsub(match, match) approach reused the pattern as the
			-- gsub replacement string, which errors out whenever the
			-- pattern contains '%' escapes (as in the documented example)
			local count = 0
			for m in body:gmatch(match) do
				count = count + 1
				table.insert(matches, "+ " .. shortenMatch(m))
			end

			matches.name = ("(%d) %s"):format(count, tostring(r.url))
			table.insert(results, matches)

			-- should we continue to search for matches?
			if ( break_on_match ) then
				crawler:stop()
				break
			end
		end
	end
	table.sort(results, function(a,b) return a.name > b.name end)
	return stdnse.format_output(true, results)
end